Outline

dplyr verbs

There are five primary dplyr verbs, representing distinct data analysis tasks:

Filter

data(french_fries, package = "reshape2")
french_fries %>%
    filter(subject == 3, time == 1)
#>   time treatment subject rep potato buttery grassy rancid painty
#> 1    1         1       3   1    2.9     0.0    0.0    0.0    5.5
#> 2    1         1       3   2   14.0     0.0    0.0    1.1    0.0
#> 3    1         2       3   1   13.9     0.0    0.0    3.9    0.0
#> 4    1         2       3   2   13.4     0.1    0.0    1.5    0.0
#> 5    1         3       3   1   14.1     0.0    0.0    1.1    0.0
#> 6    1         3       3   2    9.5     0.0    0.6    2.8    0.0

Arrange

french_fries %>%
    arrange(desc(rancid)) %>%
    head
#>   time treatment subject rep potato buttery grassy rancid painty
#> 1    9         2      51   1    7.3     2.3      0   14.9    0.1
#> 2   10         1      86   2    0.7     0.0      0   14.3   13.1
#> 3    5         2      63   1    4.4     0.0      0   13.8    0.6
#> 4    9         2      63   1    1.8     0.0      0   13.7   12.3
#> 5    5         2      19   2    5.5     4.7      0   13.4    4.6
#> 6    4         3      63   1    5.6     0.0      0   13.3    4.4

Select

french_fries %>%
    select(time, treatment, subject, rep, potato) %>%
    head
#>    time treatment subject rep potato
#> 61    1         1       3   1    2.9
#> 25    1         1       3   2   14.0
#> 62    1         1      10   1   11.0
#> 26    1         1      10   2    9.9
#> 63    1         1      15   1    1.2
#> 27    1         1      15   2    8.8

Summarise

french_fries %>%
    summarise(mean_rancid = mean(rancid, na.rm=TRUE), 
              sd_rancid = sd(rancid, na.rm = TRUE))
#>   mean_rancid sd_rancid
#> 1     3.85223  3.781815

Summarise and Group_by

french_fries %>%
    group_by(time, treatment) %>%
    summarise(mean_rancid = mean(rancid), sd_rancid = sd(rancid))
#> Source: local data frame [30 x 4]
#> Groups: time [?]
#> 
#>      time treatment mean_rancid sd_rancid
#>    <fctr>    <fctr>       <dbl>     <dbl>
#> 1       1         1    2.758333  3.212870
#> 2       1         2    1.716667  2.714801
#> 3       1         3    2.600000  3.202037
#> 4       2         1    3.900000  4.374730
#> 5       2         2    2.141667  3.117540
#> 6       2         3    2.495833  3.378767
#> 7       3         1    4.650000  3.933358
#> 8       3         2    2.895833  3.773532
#> 9       3         3    3.600000  3.592867
#> 10      4         1    2.079167  2.394737
#> # ... with 20 more rows

Let’s use these tools

to answer these french fry experiment questions:

Completeness

If the data is complete it should be 12 x 10 x 3 x 2, that is, 6 records for each person. (Assuming that each person rated on all scales.)

To check this we want to tabulate the number of records for each subject, time and treatment. This means select appropriate columns, tabulate, count and spread it out to give a nice table.

french_fries %>% 
  select(subject, time, treatment) %>% 
  tbl_df() %>% 
  count(subject, time) %>%
  spread(time, n)
#> Source: local data frame [12 x 11]
#> Groups: subject [12]
#> 
#>    subject     1     2     3     4     5     6     7     8     9    10
#> *   <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1        3     6     6     6     6     6     6     6     6     6    NA
#> 2       10     6     6     6     6     6     6     6     6     6     6
#> 3       15     6     6     6     6     6     6     6     6     6     6
#> 4       16     6     6     6     6     6     6     6     6     6     6
#> 5       19     6     6     6     6     6     6     6     6     6     6
#> 6       31     6     6     6     6     6     6     6     6    NA     6
#> 7       51     6     6     6     6     6     6     6     6     6     6
#> 8       52     6     6     6     6     6     6     6     6     6     6
#> 9       63     6     6     6     6     6     6     6     6     6     6
#> 10      78     6     6     6     6     6     6     6     6     6     6
#> 11      79     6     6     6     6     6     6     6     6     6    NA
#> 12      86     6     6     6     6     6     6     6     6    NA     6

Check completeness with different scales, too

french_fries %>% 
  gather(type, rating, -subject, -time, -treatment, -rep) %>%
  select(subject, time, treatment, type) %>% 
  tbl_df() %>% 
  count(subject, time) %>%
  spread(time, n)
#> Source: local data frame [12 x 11]
#> Groups: subject [12]
#> 
#>    subject     1     2     3     4     5     6     7     8     9    10
#> *   <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1        3    30    30    30    30    30    30    30    30    30    NA
#> 2       10    30    30    30    30    30    30    30    30    30    30
#> 3       15    30    30    30    30    30    30    30    30    30    30
#> 4       16    30    30    30    30    30    30    30    30    30    30
#> 5       19    30    30    30    30    30    30    30    30    30    30
#> 6       31    30    30    30    30    30    30    30    30    NA    30
#> 7       51    30    30    30    30    30    30    30    30    30    30
#> 8       52    30    30    30    30    30    30    30    30    30    30
#> 9       63    30    30    30    30    30    30    30    30    30    30
#> 10      78    30    30    30    30    30    30    30    30    30    30
#> 11      79    30    30    30    30    30    30    30    30    30    NA
#> 12      86    30    30    30    30    30    30    30    30    NA    30

Change in ratings over weeks, relative to experimental design

Add means over reps, and connect the dots

ff.m.av <- ff.m %>% 
  group_by(subject, time, type, treatment) %>%
  summarise(rating=mean(rating))
ggplot(data=ff.m, aes(x=time, y=rating, colour=treatment)) + 
  facet_grid(subject~type) +
  geom_line(data=ff.m.av, aes(group=treatment))

Your turn

Write an answer to each of the questions:

Working with lots of models

Why would we even do that???

First Look

library(gapminder)
ggplot(data=gapminder, aes(x=year, y=lifeExp, group=country)) + 
  geom_line()

How would you describe this plot?

First Look

ggplot(data=gapminder, aes(x=year, y=lifeExp, group=country, 
                           color=continent, shape=continent)) + 
  geom_line()

How about now?

Using models as exploratory tools

Prepping

library(purrr)

Life Expectancy in the U.S.

usa <- gapminder %>% filter(country=="United States")
head(usa)
#> # A tibble: 6 x 6
#>         country continent  year lifeExp       pop gdpPercap
#>          <fctr>    <fctr> <dbl>   <dbl>     <int>     <dbl>
#> 1 United States  Americas     2   68.44 157553000  13990.48
#> 2 United States  Americas     7   69.49 171984000  14847.13
#> 3 United States  Americas    12   70.21 186538000  16173.15
#> 4 United States  Americas    17   70.76 198712000  19530.37
#> 5 United States  Americas    22   71.34 209896000  21806.04
#> 6 United States  Americas    27   73.38 220239000  24072.63

Life Expectancy in the U.S. since 1950

ggplot(data=usa, aes(x=year, y=lifeExp)) + 
  geom_point() +
  geom_smooth(method="lm")

Life Expectancy in the U.S. since 1950

usa.lm <- lm(lifeExp~year, data=usa)
usa.lm
#> 
#> Call:
#> lm(formula = lifeExp ~ year, data = usa)
#> 
#> Coefficients:
#> (Intercept)         year  
#>     68.0455       0.1842

Intercept is estimated life expectancy at 0 BC - let’s use 1950 for the first value:

gapminder <- gapminder %>% mutate(year = year-1950)

Nesting data

We don’t want to subset the data for every country …

nest() makes a data frame part of another data frame:

by_country <- gapminder %>% group_by(continent, country) %>% nest()
by_country %>% glimpse
#> Observations: 142
#> Variables: 3
#> $ continent <fctr> Asia, Europe, Africa, Africa, Americas, Oceania, Eu...
#> $ country   <fctr> Afghanistan, Albania, Algeria, Angola, Argentina, A...
#> $ data      <list> [<c("-1948", "-1943", "-1938", "-1933", "-1928", "-...

Each element of the data variable in by_country is a dataset:

head(by_country$data[[1]])
#> # A tibble: 6 x 4
#>    year lifeExp      pop gdpPercap
#>   <dbl>   <dbl>    <int>     <dbl>
#> 1 -1948  28.801  8425333  779.4453
#> 2 -1943  30.332  9240934  820.8530
#> 3 -1938  31.997 10267083  853.1007
#> 4 -1933  34.020 11537966  836.1971
#> 5 -1928  36.088 13079460  739.9811
#> 6 -1923  38.438 14880372  786.1134
lm(lifeExp~year, data=by_country$data[[1]])
#> 
#> Call:
#> lm(formula = lifeExp ~ year, data = by_country$data[[1]])
#> 
#> Coefficients:
#> (Intercept)         year  
#>    566.2475       0.2753

Fitting multiple models

Now we are using the map function in the package purrr.

map allows us to apply a function to each element of a list.

by_country$model <- by_country$data %>% 
  purrr::map(~lm(lifeExp~year, data=.))
head(by_country)
#> # A tibble: 6 x 4
#>   continent     country              data    model
#>      <fctr>      <fctr>            <list>   <list>
#> 1      Asia Afghanistan <tibble [12 x 4]> <S3: lm>
#> 2    Europe     Albania <tibble [12 x 4]> <S3: lm>
#> 3    Africa     Algeria <tibble [12 x 4]> <S3: lm>
#> 4    Africa      Angola <tibble [12 x 4]> <S3: lm>
#> 5  Americas   Argentina <tibble [12 x 4]> <S3: lm>
#> 6   Oceania   Australia <tibble [12 x 4]> <S3: lm>
summary(by_country$model[[1]])
#> 
#> Call:
#> lm(formula = lifeExp ~ year, data = .)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -1.5447 -0.9905 -0.2757  0.8847  1.6868 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 566.24755   39.27760   14.42 5.12e-08 ***
#> year          0.27533    0.02045   13.46 9.84e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.223 on 10 degrees of freedom
#> Multiple R-squared:  0.9477, Adjusted R-squared:  0.9425 
#> F-statistic: 181.2 on 1 and 10 DF,  p-value: 9.835e-08

On to the broom package

broom allows to extract values from models on three levels:

broom::glance(by_country$model[[1]])
#>   r.squared adj.r.squared    sigma statistic      p.value df    logLik
#> 1 0.9477123     0.9424835 1.222788  181.2494 9.835213e-08  2 -18.34693
#>        AIC      BIC deviance df.residual
#> 1 42.69387 44.14859  14.9521          10
broom::tidy(by_country$model[[1]])
#>          term    estimate   std.error statistic      p.value
#> 1 (Intercept) 566.2475466 39.27760415  14.41655 5.115882e-08
#> 2        year   0.2753287  0.02045093  13.46289 9.835213e-08

broom::augment(by_country$model[[1]])
#>    lifeExp  year  .fitted   .se.fit      .resid       .hat   .sigma
#> 1   28.801 -1948 29.90729 0.6639995 -1.10629487 0.29487179 1.211813
#> 2   30.332 -1943 31.28394 0.5799442 -0.95193823 0.22494172 1.237512
#> 3   31.997 -1938 32.66058 0.5026799 -0.66358159 0.16899767 1.265886
#> 4   34.020 -1933 34.03722 0.4358337 -0.01722494 0.12703963 1.288917
#> 5   36.088 -1928 35.41387 0.3848726  0.67413170 0.09906760 1.267003
#> 6   38.438 -1923 36.79051 0.3566719  1.64748834 0.08508159 1.154002
#> 7   39.854 -1918 38.16716 0.3566719  1.68684499 0.08508159 1.147076
#> 8   40.822 -1913 39.54380 0.3848726  1.27820163 0.09906760 1.208243
#> 9   41.674 -1908 40.92044 0.4358337  0.75355828 0.12703963 1.260583
#> 10  41.763 -1903 42.29709 0.5026799 -0.53408508 0.16899767 1.274051
#> 11  42.129 -1898 43.67373 0.5799442 -1.54472844 0.22494172 1.148593
#> 12  43.828 -1893 45.05037 0.6639995 -1.22237179 0.29487179 1.194109
#>         .cooksd  .std.resid
#> 1  2.427205e-01 -1.07742164
#> 2  1.134714e-01 -0.88428127
#> 3  3.603567e-02 -0.59530844
#> 4  1.653992e-05 -0.01507681
#> 5  1.854831e-02  0.58082792
#> 6  9.225358e-02  1.40857509
#> 7  9.671389e-02  1.44222437
#> 8  6.668277e-02  1.10129103
#> 9  3.165567e-02  0.65958143
#> 10 2.334344e-02 -0.47913530
#> 11 2.987950e-01 -1.43494020
#> 12 2.963271e-01 -1.19046907

Extract values for each coefficient

Extract all countries automatically (hello map again!)

by_country$stats <- by_country$model %>% purrr::map(broom::tidy)
by_country_coefs <- by_country %>% unnest(stats)
coefs <- by_country_coefs %>% 
  select(country, continent, term, estimate) %>% 
  spread(term, estimate)

and finally, a visualization:

ggplot(data=coefs, aes(x=`(Intercept)`, y=year, colour=continent)) +
  geom_point()

Extract other model diagnostics

by_country <- by_country %>% unnest(model %>% purrr::map(broom::glance))
by_country$country <- reorder(by_country$country, by_country$r.squared)
ggplot(data=by_country, aes(x=country, y=r.squared, colour=continent)) +
  geom_point() +
  theme(axis.text.x=element_text(angle=-90, hjust=0)) +
  scale_colour_brewer(palette="Dark2")

Compare groups of countries

country_all <- by_country %>% unnest(data)
ggplot(data=filter(country_all, r.squared <= 0.45), 
       aes(x=year+1950, y=lifeExp)) +
  geom_line() +
  facet_wrap(~country)

What do the patterns mean?


ggplot(data=subset(country_all, between(r.squared, 0.45, 0.75)), 
       aes(x=year+1950, y=lifeExp)) +
  geom_line() +
  facet_wrap(~country)

Your turn

by_country.back <- by_country
by_country$augment <- by_country$model %>% purrr::map(broom::augment)
country_all <- by_country %>% unnest(data, augment)
ggplot(country_all, aes(x=lifeExp, y=.resid)) + geom_point() +
  geom_smooth()

Overall, a linear fit is not great: at low and high life expectancy, linear fit is too fast

broom for bio data

biobroom package (Bioconductor; Bass, Nelson, Robinson, Storey, 2016) has the same functions as broom, i.e. glance, tidy, and augment.

These functions provide information depending on the input type

library(biobroom)
data(hammer)

counts <- Biobase::exprs(hammer)
head(counts)
#>                    SRX020102 SRX020103 SRX020104 SRX020105 SRX020091-3
#> ENSRNOG00000000001         2         4        18        24           7
#> ENSRNOG00000000007         4         1         3         1           5
#> ENSRNOG00000000008         0         1         4         2           0
#> ENSRNOG00000000009         0         0         0         0           0
#> ENSRNOG00000000010        19        10        19        13          50
#> ENSRNOG00000000012         7         5         1         0          31
#>                    SRX020088-90 SRX020094-7 SRX020098-101
#> ENSRNOG00000000001            4          93            77
#> ENSRNOG00000000007            4           9             4
#> ENSRNOG00000000008            5           2             6
#> ENSRNOG00000000009            0           0             0
#> ENSRNOG00000000010           57          45            58
#> ENSRNOG00000000012           26          12             9

The Hammer study

published as part of the biobroom package, part of the ReCount project

# more information about the study:
Biobase::phenoData(hammer)@data
#>                   sample.id num.tech.reps protocol         strain     Time
#> SRX020102         SRX020102             1  control Sprague Dawley 2 months
#> SRX020103         SRX020103             2  control Sprague Dawley 2 months
#> SRX020104         SRX020104             1   L5 SNL Sprague Dawley 2 months
#> SRX020105         SRX020105             2   L5 SNL Sprague Dawley  2months
#> SRX020091-3     SRX020091-3             1  control Sprague Dawley  2 weeks
#> SRX020088-90   SRX020088-90             2  control Sprague Dawley  2 weeks
#> SRX020094-7     SRX020094-7             1   L5 SNL Sprague Dawley  2 weeks
#> SRX020098-101 SRX020098-101             2   L5 SNL Sprague Dawley  2 weeks

biobroom and edgeR

identify differentially expressed genes following the protocol by Robinson, McCarthy and Smyth 2009

library(edgeR)
y <- DGEList(counts = counts, group=Biobase::phenoData(hammer)@data$protocol)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y)

glance(et, alpha = 0.05) # glance on DGEExact
#>   significant     comparison
#> 1        6993 control/L5 SNL

biobroom and edgeR

tet <- tidy(et)
tet$significant <- tet$p.value < 0.05
ggplot(data=tet, aes(x=logCPM, y=estimate, colour=significant)) +
  geom_point(alpha=.1) + facet_wrap(~significant, labeller="label_both")

augment(y) stops in an error (this is a bug and reported)

biobroom and limma

bb <- data.frame(read_tsv("./data/biotin-rma2.txt"))
head(data.frame(bb[,-2]))
#>         Gene biotin.WT.01.1 biotin.bio101.4 biotin.WT.B1.2 biotin.bio1B1.3
#> 1   11986_at       6.359180        6.004075       6.325338        6.255888
#> 2   11987_at       7.833549        6.666034       7.738628        7.691321
#> 3   11988_at       6.145599        6.128749       6.258157        6.353912
#> 4   11989_at       5.675039        5.078695       5.444967        5.380756
#> 5   11990_at       5.382078        5.241411       5.209638        5.357740
#> 6 11991_g_at       5.389809        5.087797       5.403933        5.592901
#>   biotin.WT.02.1 biotin.bio102.4 biotin.WT.B2.2 biotin.bio1B2.3
#> 1       6.554727        6.144557       6.098169        6.201624
#> 2       7.461711        7.300383       7.353033        7.298990
#> 3       6.097703        6.183593       6.184723        6.276867
#> 4       5.258524        5.328962       5.460898        5.203674
#> 5       5.337013        5.276889       5.434531        5.362002
#> 6       5.571362        5.249382       5.596683        5.140585
row.names(bb) <- bb$Gene

Looking at the gene expression data

ggpairs(bb, columns=c(3,7,4,8))

A porcupine plot again

sub <- bb %>% select(Gene, biotin.WT.01.1, biotin.WT.02.1, biotin.bio101.4, biotin.bio102.4)
ggplot(sub, aes(x=biotin.WT.01.1, xend=biotin.WT.02.1, y=biotin.bio101.4, yend=biotin.bio102.4)) +
  geom_segment() +
  theme(aspect.ratio = 1) +
  xlab("wildtype, control treatment") +
  ylab("mutant, treated")

Fit a limma model

design <- expand.grid(type=c("wild", "mutant"), trt=c("control", "treatment"), rep=1:2)

fit <- lmFit(bb[,-(1:2)], model.matrix(~ type*trt, design))
fit <- eBayes(fit)

head(topTable(fit))
#>            typemutant trttreatment typemutant.trttreatment  AveExpr
#> 13212_s_at   3.436575    0.1501056               -3.282545 6.383540
#> 13449_at     2.498316    0.1163889               -2.488131 5.287509
#> 14609_at     2.301786   -0.2724221               -1.962619 5.406537
#> 16016_at    -3.749019    1.2864088                3.338380 7.858121
#> 18255_at     1.696560   -0.3977035               -1.326075 6.844090
#> 15162_at     2.699042    0.1022273               -2.354713 8.408859
#>                    F      P.Value    adj.P.Val
#> 13212_s_at 297.14511 8.048043e-08 0.0006677461
#> 13449_at   154.05666 8.051126e-07 0.0033400095
#> 14609_at   129.24605 1.484583e-06 0.0041058604
#> 16016_at   108.19217 2.752579e-06 0.0048917219
#> 18255_at   106.07250 2.947886e-06 0.0048917219
#> 15162_at    91.58383 4.898100e-06 0.0067732558

Your Turn

For the previous example, try out what output the different broom functions (glance, tidy, augment) produce. Create a Volcano plot for each of the model terms, i.e. plot estimates on x by log(p.values) on y. Are there differences visible between the terms?

head(tidy(fit))
#> # A tibble: 6 x 6
#>         gene       term    estimate  statistic    p.value       lod
#>        <chr>      <chr>       <dbl>      <dbl>      <dbl>     <dbl>
#> 1   11986_at typemutant -0.38263784 -2.9258469 0.02180752 -3.500735
#> 2   11987_at typemutant -0.66442177 -2.5373489 0.03836594 -4.069831
#> 3   11988_at typemutant  0.03451959  0.3393213 0.74418455 -6.556257
#> 4   11989_at typemutant -0.26295371 -1.5704944 0.15969599 -5.437119
#> 5   11990_at typemutant -0.10039542 -0.8814401 0.40693026 -6.207405
#> 6 11991_g_at typemutant -0.31199630 -1.7904790 0.11590003 -5.142913
ggplot(tidy(fit), aes(x=estimate, y=log(p.value), colour = p.value < 0.05)) +
  facet_wrap(~term) +
  geom_point() + ggtitle("Volcano Plots with limma") 

bbfit <- tidy(fit)
ggplot(data=bbfit, aes(x=term, y=estimate, group=gene)) +
  geom_line(alpha=0.1) +
  geom_point(aes(color=log(p.value)), size=2, alpha=0.6)

Is type*treatment interaction necessary? Very strong negative correlation is suspicious.

bbfit_m <- bbfit %>% select(gene, term, estimate, p.value) %>% 
  gather(fit.stat, value, -gene, -term) %>%
  unite(term_stat, term, fit.stat) %>%
  spread(term_stat, value) %>% 
  rename(trt=trttreatment_estimate, mut=typemutant_estimate,
         int=`typemutant:trttreatment_estimate`, 
         trtp=trttreatment_p.value, mutp=typemutant_p.value, 
         intp=`typemutant:trttreatment_p.value`) 
ggpairs(bbfit_m, columns=c(2,4,6), upper=list(continuous="points"),
          ggplot2::aes(colour=intp)) + theme(aspect.ratio=1)

ggpairs(bbfit_m, columns=c(2,4,6), upper=list(continuous="points"),
          ggplot2::aes(colour=intp)) + theme(aspect.ratio=1)

fit2 <- lmFit(bb[,-(1:2)], model.matrix(~ type+trt, design))
fit2 <- eBayes(fit2)
bbfit2 <- tidy(fit2)
ggplot(data=bbfit2, aes(x=term, y=estimate, group=gene)) +
  geom_line(alpha=0.1) +
  geom_point(aes(color=log(p.value)), size=2, alpha=0.6)

Share and share alike

This work is licensed under the Creative Commons Attribution-Noncommercial 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/ 3.0/us/ or send a letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.